---
title: "Study 2 Cleaning"
author: "Kaylyn Jackson Schiff, Daniel Schiff, and Natalia Bueno"
date: "2020"
output: pdf_document
editor_options: 
  chunk_output_type: console
---

##### Setup Chunk
```{r setup, include=FALSE}
#Echo chunk source in the knitted output by default
knitr::opts_chunk$set(echo = TRUE)
#Attach packages with library(): a missing package stops the run here,
#whereas require() only warns, returns FALSE, and fails later at first use
library(rmarkdown)
library(skimr)
library(readr)
library(tidyverse)
#Strongly prefer fixed over scientific notation when printing numbers
options(scipen = 999)
```

##### Functions
```{r functions}
set.seed(1234)

#Mean effects index: each outcome is standardized against the control group
#(Z == 0), the standardized outcomes are averaged within unit, and the average
#is standardized against the control group again.
#
#Arguments:
#  Z            treatment indicator, one value per row of outcome_mat, coded
#               0 for control and 1 for treated (impute indexes rows by Z + 1)
#  outcome_mat  numeric matrix or data frame; rows are units, columns outcomes
#  to_reorient  columns (names or positions) whose sign is flipped; only used
#               when reorient = TRUE
#  reorient     if TRUE, multiply the to_reorient columns by -1 first
#  greedy       if TRUE, average over the outcomes actually observed for each
#               unit; if FALSE, divide by the total number of outcomes, so a
#               unit with any missing outcome gets NA
#  impute       if TRUE, fill missing outcomes with the column mean of the
#               unit's own treatment arm before standardizing
#
#Returns a numeric vector of length nrow(outcome_mat); a unit with no observed
#outcome gets NA.
calculate_mean_effects_index <- function(Z, outcome_mat, to_reorient, reorient = FALSE, greedy = TRUE,
                                         impute = FALSE){
  if(length(Z) != nrow(outcome_mat)) stop("Error: Treatment assignment, outcome matrix require same n!")

  #Work on a plain matrix so data frames, tibbles, and matrices behave alike
  outcome_mat <- as.matrix(outcome_mat)

  if (isTRUE(reorient)) {
    if (missing(to_reorient)) stop("Error: reorient = TRUE requires to_reorient!")
    outcome_mat[, to_reorient] <- -outcome_mat[, to_reorient]
  }

  #which() drops units with missing Z from the reference groups
  control_rows <- which(Z == 0)

  if (isTRUE(impute)) {
    treated_rows <- which(Z == 1)
    arm_means <- rbind(colMeans(outcome_mat[control_rows, , drop = FALSE], na.rm = TRUE),
                       colMeans(outcome_mat[treated_rows, , drop = FALSE], na.rm = TRUE))
    missing_cells <- is.na(outcome_mat)
    #Row Z + 1 of arm_means is the mean vector of each unit's own arm
    outcome_mat[missing_cells] <- arm_means[Z + 1, , drop = FALSE][missing_cells]
  }

  #drop = FALSE keeps a single-outcome matrix from collapsing to a vector
  controls <- outcome_mat[control_rows, , drop = FALSE]
  c_mean <- colMeans(controls, na.rm = TRUE)
  c_sd <- apply(controls, MARGIN = 2, FUN = sd, na.rm = TRUE)
  z_score <- sweep(sweep(outcome_mat, 2, c_mean, "-"), 2, c_sd, "/")

  if (isTRUE(greedy)) {
    #Sum and count only the observed outcomes; without na.rm the sum is NA for
    #any unit with a missing outcome and greedy has no effect
    n_outcomes <- rowSums(!is.na(z_score))
    index_numerator <- rowSums(z_score, na.rm = TRUE)
    index_numerator[n_outcomes == 0] <- NA
  } else {
    n_outcomes <- ncol(outcome_mat)
    index_numerator <- rowSums(z_score)
  }

  index <- index_numerator / n_outcomes
  (index - mean(index[control_rows], na.rm = TRUE)) / sd(index[control_rows], na.rm = TRUE)
}
```

##### Import Data
```{r import data}
#Imports data and removes PII variable and saves data for use.
#The commented-out lines are the one-time preparation steps: read the raw CSV
#(column names from the header row, data after skipping the first three lines),
#drop the zip column, and save the result as an RDS file.
# df_names <- read_csv("data/lucid_sample_5.3.21.csv", n_max = 0) %>% names()
# df_raw <- read_csv("data/lucid_sample_5.3.21.csv", col_names = df_names, skip = 3)
# df <- df_raw %>% filter(Finished==1) %>% as_tibble
# df <- df_raw

#Excluding zip data
#df <- df %>% select(-zip)
#saveRDS(df, "data/lucid_sample_5.3.21.rds")

#Load the prepared data.
#NOTE(review): the saveRDS() line above writes ".rds" but this reads ".Rds"; on a
#case-sensitive file system those are different files -- confirm the name on disk.
df <- readRDS("data/lucid_sample_5.3.21.Rds")

```

##### Explore Data
```{r explore data}
#Inspection only (nothing is assigned): print the data, then an overview of
#every column, then the column names
df
glimpse(df)
colnames(df)
```

##### Subset Data
```{r subset data}
#Keep only the variables used below; columns come out in the order listed here
df <- dplyr::select(
  df,

  #respondent data
  ResponseId,

  #screening questions
  screener_1_1:screener_1_21,
  screener_2_1:screener_2_6,

  #first experiment (IU vs. control vs. IU + FC)
  politician_1,                                   #politician name
  IU, Control_1, FC,                              #allegations
  belief_1, belief_2,                             #belief outcomes
  support_1, support_2, support_3, support_4,     #support outcomes
  trust_1, trust_2,                               #trust outcomes
  offensive, uncertainty,                         #new outcomes for all
  belief_alleg, support_alleg, share_alleg,       #new outcomes for IU and FC
  belief_FC, support_FC,                          #new outcomes for FC

  #second experiment (OR vs. control)
  politician_2,                                   #politician name
  OR, Control_2,                                  #allegations
  support_1_OR, support_2_OR, support_3_OR, support_4_OR,   #support outcomes
  offensive_OR,                                   #new outcomes for all

  #both experiments
  literacy1, literacy2, literacy3, familiar_df,   #news media and digital literacy
  concern_FN, detect_FN,                          #new covariates
  cancel_culture, political_correctness, forgiveness,
  trust_gov, trust_pol, trust_others, trust_opp,  #new exploratory outcomes
  clicked1,

  #demographics
  party, political_party, gender, race, age, education_2, hhi, region
)

#summary statistics and class
skim(df)
```

##### Recode and Create Variables
```{r recode and create new variables}
#Create attentiveness index
#screener_1 is 1 when screener_1_21 equals 1 and screener_1_1 to screener_1_20 are all missing.
#screener_2 is 1 when screener_2_3 and screener_2_5 both equal 1 and
#screener_2_1, _2, _4 and _6 are all missing.
#attentiveness is the number of screeners passed (0, 1 or 2).
df <- df %>%
  mutate(
    screener_1 = if_else(
      screener_1_21 %in% 1 &
        rowSums(!is.na(df[paste0("screener_1_", 1:20)])) == 0,
      1, 0
    ),
    screener_2 = if_else(
      screener_2_3 %in% 1 &
        screener_2_5 %in% 1 &
        rowSums(!is.na(df[paste0("screener_2_", c(1, 2, 4, 6))])) == 0,
      1, 0
    ),
    attentiveness = screener_1 + screener_2
  )

#Create variables for treatment assignments
#Each respondent gets the first arm whose indicator equals 1; respondents with no
#indicator equal to 1 are left NA. The first factor level (Control) is the reference.
df <- df %>%
  mutate(
    alleg_treatment_1 = factor(
      case_when(
        Control_1 == 1 ~ "Control",
        IU == 1 ~ "IU",
        FC == 1 ~ "FC"
      ),
      levels = c("Control", "IU", "FC"),
      labels = c("Control", "Info. Uncertain", "Fact Check")
    ),
    alleg_treatment_2 = factor(
      case_when(
        Control_2 == 1 ~ "Control",
        OR == 1 ~ "OR"
      ),
      levels = c("Control", "OR"),
      labels = c("Control", "Opp. Rally")
    )
  )

#Recode political ideology
#All three-category factors list Independent first so it is the reference category.
#Codes in the case_when calls: 1 = Democrat, 2 = Independent, 3 = Republican.
df <- df %>%
  mutate(
    #leaners (party 3 and 5) counted as Independent
    party_3 = factor(
      case_when(
        party %in% c(1, 2) ~ 1,
        party %in% c(3, 4, 5) ~ 2,
        party %in% c(6, 7) ~ 3
      ),
      levels = c(2, 1, 3),
      labels = c("Independent", "Democrat", "Republican")
    ),
    #leaners (party 3 and 5) counted with the party they lean toward
    party_3_lean = factor(
      case_when(
        party %in% c(1, 2, 3) ~ 1,
        party %in% 4 ~ 2,
        party %in% c(5, 6, 7) ~ 3
      ),
      levels = c(2, 1, 3),
      labels = c("Independent", "Democrat", "Republican")
    )
  )

#Label the seven-point scale itself only after the numeric codes have been used above
df <- df %>%
  mutate(
    party = factor(
      party,
      levels = c(4, 1, 2, 3, 5, 6, 7),
      labels = c("Independent", "Strong Democrat", "Democrat", "Lean Democrat",
                 "Lean Republican", "Republican", "Strong Republican")
    )
  )

#Recode pre-treatment political ideology
df <- df %>%
  mutate(
    pre_party_3 = factor(
      case_when(
        political_party %in% c(1, 2) ~ 1,
        political_party %in% c(3:8) ~ 2,
        political_party %in% c(9, 10) ~ 3
      ),
      levels = c(2, 1, 3),
      labels = c("Independent", "Democrat", "Republican")
    ),
    pre_party_3_lean = factor(
      case_when(
        political_party %in% c(1, 2, 3, 6) ~ 1,
        political_party %in% c(4, 7) ~ 2,
        political_party %in% c(5, 8, 9, 10) ~ 3
      ),
      levels = c(2, 1, 3),
      labels = c("Independent", "Democrat", "Republican")
    )
  )

#Code a respondent's partisan relation to the politician they were shown.
#  party       three-category party variable ("Independent", "Democrat", "Republican")
#  politician  politician name shown to the respondent
#  same        value when respondent and politician share a party
#  opposite    value when respondent and politician are of opposing parties
#Independents are always coded 0. The result is NA when party is missing, or when a
#Democrat/Republican respondent has a missing or unlisted politician.
code_partisanship <- function(party, politician, same, opposite) {
  party <- as.character(party)
  politician_party <- case_when(
    politician %in% c("Jesse Jackson", "John Murtha") ~ "Democrat",
    politician %in% c("Todd Akin", "Tim James") ~ "Republican"
  )
  case_when(
    party == "Independent" ~ 0,
    party == politician_party ~ same,
    party != politician_party ~ opposite
  )
}

df <- df %>%
  mutate(
    #Create strong co-partisanship variable (1 = co-partisan, 0 = anti-partisan or Independent)
    copartisan_1 = code_partisanship(party_3, politician_1, same = 1, opposite = 0),
    copartisan_2 = code_partisanship(party_3, politician_2, same = 1, opposite = 0),

    #Create moderate partisanship variable
    moderate = if_else(party_3 == "Independent", 1, 0),

    #Create anti-partisanship variable (1 = anti-partisan, 0 = co-partisan or Independent)
    antipartisan_1 = code_partisanship(party_3, politician_1, same = 0, opposite = 1),
    antipartisan_2 = code_partisanship(party_3, politician_2, same = 0, opposite = 1),

    #Create partisanship variable (1 = co-partisan, -1 = anti-partisan, 0 = Independent)
    partisan_1 = code_partisanship(party_3, politician_1, same = 1, opposite = -1),
    partisan_2 = code_partisanship(party_3, politician_2, same = 1, opposite = -1),

    #Create pre-treatment partisanship variable
    pre_partisan_1 = code_partisanship(pre_party_3, politician_1, same = 1, opposite = -1),
    pre_partisan_2 = code_partisanship(pre_party_3, politician_2, same = 1, opposite = -1),

    #Create partisanship variable with leaners
    partisan_1_lean = code_partisanship(party_3_lean, politician_1, same = 1, opposite = -1),
    partisan_2_lean = code_partisanship(party_3_lean, politician_2, same = 1, opposite = -1),

    #Create pre-treatment partisanship variable with leaners
    pre_partisan_1_lean = code_partisanship(pre_party_3_lean, politician_1, same = 1, opposite = -1),
    pre_partisan_2_lean = code_partisanship(pre_party_3_lean, politician_2, same = 1, opposite = -1)
  )

#Recode gender, race/ethnicity, age and education
#gender and race are labelled in place. age is binned into five birth cohorts using
#year-of-birth cutoffs relative to 2020; the numeric bin is kept as age2 and age itself
#is replaced by the labelled factor. education_2 is collapsed into four levels.
df <- df %>%
  mutate(
    gender = factor(gender, levels = c(1, 2), labels = c("Male", "Female")),
    race = factor(
      race,
      levels = 1:5,
      labels = c("White", "Black", "Hispanic", "Asian", "Other")
    ),
    age2 = case_when(
      age >= 2020 - 1945                      ~ 5,   #born 1945 or earlier
      age <= 2020 - 1946 & age >= 2020 - 1964 ~ 4,   #born 1946-1964
      age <= 2020 - 1965 & age >= 2020 - 1980 ~ 3,   #born 1965-1980
      age <= 2020 - 1981 & age >= 2020 - 1996 ~ 2,   #born 1981-1996
      age <= 2020 - 1997                      ~ 1    #born 1997 or later
    ),
    age = factor(
      age2,
      levels = 1:5,
      labels = c("Gen Z", "Millennials", "Gen X", "Boomers", "Silent")
    ),
    education = case_when(
      education_2 <= 2 ~ 1,                      #High school graduate or less
      education_2 >= 3 & education_2 <= 4 ~ 2,   #Some college or Associate degree
      education_2 == 5 ~ 3,                      #Bachelor's degree
      education_2 >= 6 ~ 4                       #Graduate degree
    ),
    education = factor(
      education,
      levels = 1:4,
      labels = c("High school graduate or less", "Some college", "Bachelor's degree", "Graduate degree")
    )
  )

#Recode income
#recode missing values (coded -3105) to the mean of the observed codes; na.rm = TRUE
#keeps a genuine NA in hhi from turning that mean, and so every imputed value, into NA
df$hhi <- ifelse(df$hhi == -3105, mean(df$hhi[df$hhi != -3105], na.rm = TRUE), df$hhi)
#The imputed mean is generally not a whole number, so the bins use half-open intervals;
#closed integer bounds would leave gaps (4 to 5, 13 to 14) in which income becomes NA
df <- df %>% mutate(income = case_when(
  hhi >= 1 & hhi < 5 ~ 1,            #Under $30k (codes 1-4)
  hhi >= 5 & hhi < 14 ~ 2,           #$30k to under $75k (codes 5-13)
  hhi >= 14 ~ 3                      #At least $75k (codes 14 and up)
  ))
#Middle income is listed first so it is the reference category
df$income <- factor(df$income, levels = c(2,1,3), 
                  labels = c("Middle income","Low income","High income"), 
                  ordered = FALSE)

#Recode region, score the news media literacy items, and rename digital literacy
#Each literacy item is scored 1 when the response equals the keyed answer, 0 otherwise;
#media_literacy is the number of items scored 1 and is NA if any item is missing.
df <- df %>%
  mutate(
    region = factor(
      region,
      levels = 1:4,
      labels = c("Northeast", "Midwest", "South", "West")
    ),
    literacy1_correct = if_else(literacy1 == 2, 1, 0),
    literacy2_correct = if_else(literacy2 == 4, 1, 0),
    literacy3_correct = if_else(literacy3 == 2, 1, 0),
    media_literacy = literacy1_correct + literacy2_correct + literacy3_correct
  ) %>%
  rename(digital_literacy = familiar_df)

#Binary treatment indicators: 0 for the Control arm, 1 for any other arm
df <- df %>%
  mutate(
    treat_1 = ifelse(alleg_treatment_1 == "Control", 0, 1),
    treat_2 = ifelse(alleg_treatment_2 == "Control", 0, 1)
  )

#Mean effects indices, each built from the listed outcome columns
df <- df %>%
  mutate(
    #Create index for belief
    belief_exp1 = calculate_mean_effects_index(
      Z = treat_1,
      outcome_mat = dplyr::select(df, belief_1, belief_2),
      reorient = FALSE
    ),
    #Create index for support
    support_exp1 = calculate_mean_effects_index(
      Z = treat_1,
      outcome_mat = dplyr::select(df, support_1, support_2, support_3, support_4),
      reorient = FALSE
    ),
    support_exp2 = calculate_mean_effects_index(
      Z = treat_2,
      outcome_mat = dplyr::select(df, support_1_OR, support_2_OR, support_3_OR, support_4_OR),
      reorient = FALSE
    ),
    #Create index for support no donations (fourth support item left out)
    support_exp1_nodonation = calculate_mean_effects_index(
      Z = treat_1,
      outcome_mat = dplyr::select(df, support_1, support_2, support_3),
      reorient = FALSE
    ),
    support_exp2_nodonation = calculate_mean_effects_index(
      Z = treat_2,
      outcome_mat = dplyr::select(df, support_1_OR, support_2_OR, support_3_OR),
      reorient = FALSE
    ),
    #Create index for trust
    trust_exp1 = calculate_mean_effects_index(
      Z = treat_1,
      outcome_mat = dplyr::select(df, trust_1, trust_2),
      reorient = FALSE
    )
  )


#Create factor variable for share_alleg
df$share_alleg <- factor(df$share_alleg, levels = c(1,2,3,4), 
                  labels = c("No - because I don't support it","No - although I support it",
                             "Yes - but not because I support it", "Yes - because I support it"), 
                  ordered = FALSE)

#Standardize new outcomes
#scale() returns a one-column matrix; as.numeric() stores the same z-scores as an
#ordinary numeric column instead of a matrix-column inside the data frame
#df$offensive <- scale(df$offensive)
#df$offensive_OR <- scale(df$offensive_OR)
df$uncertainty <- as.numeric(scale(df$uncertainty))
#df$belief_alleg <- scale(df$belief_alleg)
#df$support_alleg <- scale(df$support_alleg)
#df$belief_FC <- scale(df$belief_FC)
#df$support_FC <- scale(df$support_FC)

#Standardize extra covariates
#df$concern_FN <- scale(df$concern_FN)
#df$detect_FN <- scale(df$detect_FN)
#df$cancel_culture <- scale(df$cancel_culture)
#df$political_correctness <- scale(df$political_correctness)
df$accountability <- if_else(df$forgiveness == 2, 1, 0) #recode forgiveness as binary

#Standardize exploratory outcomes
df$trust_gov <- as.numeric(scale(df$trust_gov))
df$trust_pol <- as.numeric(scale(df$trust_pol))
df$trust_others <- as.numeric(scale(df$trust_others))
df$trust_opp <- as.numeric(scale(df$trust_opp))
```

```{r}
#Reselect variables
#Show what is available, then keep the cleaned analysis variables in the order listed
names(df)
df <- dplyr::select(
  df,

  #respondent data and attentiveness index
  ResponseId,
  attentiveness,

  #first experiment (IU vs. control vs. IU + FC)
  politician_1,                                   #politician name
  alleg_treatment_1,                              #allegations
  belief_exp1, support_exp1, trust_exp1, support_exp1_nodonation,   #index outcomes
  belief_1:trust_2,                               #individual outcomes
  offensive, uncertainty,                         #new outcomes for all
  belief_alleg, support_alleg, share_alleg,       #new outcomes for IU and FC
  belief_FC, support_FC,                          #new outcomes for FC

  #second experiment (OR vs. control)
  politician_2,                                   #politician name
  alleg_treatment_2,                              #allegations
  support_exp2, support_exp2_nodonation,          #index outcomes
  support_1_OR:support_4_OR,                      #individual outcomes
  offensive_OR,                                   #new outcomes for all

  #both experiments
  media_literacy, digital_literacy,               #news media and digital literacy
  concern_FN, detect_FN,                          #new covariates
  cancel_culture, political_correctness, accountability,
  trust_gov, trust_pol, trust_others, trust_opp,  #new exploratory outcomes
  clicked1,

  #partisanship variables
  copartisan_1, copartisan_2, moderate,
  antipartisan_1, antipartisan_2,
  partisan_1, partisan_2,
  pre_partisan_1, pre_partisan_2,
  partisan_1_lean, partisan_2_lean,
  pre_partisan_1_lean, pre_partisan_2_lean,
  party_3, party_3_lean,
  pre_party_3, pre_party_3_lean,

  #demographics
  party, gender, race, age, education, income, region
)
```

##### Export File
```{r export file}
#Save the cleaned data. The file is passed by position because readr deprecated the
#`path` argument of write_rds() in favour of `file`; a positional argument works with both.
write_rds(df, "data/df_followup_clean.rds")
```
